Clustering

Some customers didn't write a review. But why are they happy or mad?

In [1]:
import numpy as np
import pandas as pd 
import os
import holoviews as hv
import geoviews as gv
import datashader as ds
from colorcet import fire, rainbow, bgy, bjy, bkr, kb, kr
from datashader.colors import colormap_select, Greys9
from holoviews.streams import RangeXY
from holoviews.operation.datashader import datashade, dynspread, rasterize
from bokeh.io import push_notebook, show, output_notebook
import warnings

warnings.filterwarnings('ignore')

geo = pd.read_csv("../dataset/olist_geolocation_dataset.csv", dtype={'geolocation_zip_code_prefix': str})

# Gets the first three and four first digits of zip codes, we will explore this further to understand how zip codes works
geo['geolocation_zip_code_prefix_1_digits'] = geo['geolocation_zip_code_prefix'].str[0:1]
geo['geolocation_zip_code_prefix_2_digits'] = geo['geolocation_zip_code_prefix'].str[0:2]
geo['geolocation_zip_code_prefix_3_digits'] = geo['geolocation_zip_code_prefix'].str[0:3]
geo['geolocation_zip_code_prefix_4_digits'] = geo['geolocation_zip_code_prefix'].str[0:4]

# Removing some outliers
#Brazils most Northern spot is at 5 deg 16′ 27.8″ N latitude.;
geo = geo[geo.geolocation_lat <= 5.27438888]
#it’s most Western spot is at 73 deg, 58′ 58.19″W Long.
geo = geo[geo.geolocation_lng >= -73.98283055]
#It’s most southern spot is at 33 deg, 45′ 04.21″ S Latitude.
geo = geo[geo.geolocation_lat >= -33.75116944]
#It’s most Eastern spot is 34 deg, 47′ 35.33″ W Long.
geo = geo[geo.geolocation_lng <=  -34.79314722]



from datashader.utils import lnglat_to_meters as webm
x, y = webm(geo.geolocation_lng, geo.geolocation_lat)
geo['x'] = pd.Series(x)
geo['y'] = pd.Series(y)

# transforming the prefixes to int for plotting purposes
geo['geolocation_zip_code_prefix'] = geo['geolocation_zip_code_prefix'].astype(int)
geo['geolocation_zip_code_prefix_1_digits'] = geo['geolocation_zip_code_prefix_1_digits'].astype(int)
geo['geolocation_zip_code_prefix_2_digits'] = geo['geolocation_zip_code_prefix_2_digits'].astype(int)
geo['geolocation_zip_code_prefix_3_digits'] = geo['geolocation_zip_code_prefix_3_digits'].astype(int)
geo['geolocation_zip_code_prefix_4_digits'] = geo['geolocation_zip_code_prefix_4_digits'].astype(int)

brazil = geo
agg_name = 'geolocation_zip_code_prefix'

# plot wtih holoviews + datashader - bokeh with map background
output_notebook()
hv.extension('bokeh')

%opts Overlay [width=800 height=600 toolbar='above' xaxis=None yaxis=None]
%opts QuadMesh [tools=['hover'] colorbar=True] (alpha=0 hover_alpha=0.2)

T = 0.05
PX = 1

def plot_map(data, label, agg_data, agg_name, cmap):
    url="http://server.arcgisonline.com/ArcGIS/rest/services/Canvas/World_Dark_Gray_Base/MapServer/tile/{Z}/{Y}/{X}.png"
    geomap = gv.WMTS(url)
    points = hv.Points(gv.Dataset(data, kdims=['x', 'y'], vdims=[agg_name]))
    agg = datashade(points, element_type=gv.Image, aggregator=agg_data, cmap=cmap)
    zip_codes = dynspread(agg, threshold=T, max_px=PX)
    hover = hv.util.Dynamic(rasterize(points, aggregator=agg_data, width=50, height=25, streams=[RangeXY]), operation=hv.QuadMesh)
    hover = hover.options(cmap=cmap)
    img = geomap * zip_codes * hover
    img = img.relabel(label)
    return img

orders_df = pd.read_csv('../dataset/olist_orders_dataset.csv')
order_items = pd.read_csv('../dataset/olist_order_items_dataset.csv')
order_reviews = pd.read_csv('../dataset/olist_order_reviews_dataset.csv')
customer = pd.read_csv('../dataset/olist_customers_dataset.csv', dtype={'customer_zip_code_prefix': str})

# getting the first 3 digits of customer zipcode
customer['customer_zip_code_prefix_3_digits'] = customer['customer_zip_code_prefix'].str[0:3]
customer['customer_zip_code_prefix_3_digits'] = customer['customer_zip_code_prefix_3_digits'].astype(int)

brazil_geo = geo.set_index('geolocation_zip_code_prefix_3_digits').copy()

# merging the data
orders = orders_df.merge(order_items, on='order_id')
orders = orders.merge(customer, on='customer_id')
orders = orders.merge(order_reviews, on='order_id')

orders['order_delivered_customer_date'] = pd.to_datetime(orders.order_delivered_customer_date)
orders['order_estimated_delivery_date'] = pd.to_datetime(orders.order_estimated_delivery_date)
orders['order_delivered_carrier_date'] = pd.to_datetime(orders.order_delivered_carrier_date)
orders['actual_delivery_time'] = orders.order_delivered_customer_date - orders.order_delivered_carrier_date
orders['actual_delivery_time'] = orders['actual_delivery_time'].dt.days

gp = orders.groupby('customer_zip_code_prefix_3_digits')['review_score'].mean().to_frame()
score = brazil_geo.join(gp)
agg_name = 'avg_score'
score[agg_name] = score['review_score']

plot_map(score, 'Orders Average Review Score', ds.mean(agg_name), agg_name, cmap=bgy)
Loading BokehJS ...
Out[1]:
In [2]:
orders['is_delayed'] = orders['order_delivered_customer_date'] > orders['order_estimated_delivery_date'] 
gp = orders.groupby('customer_zip_code_prefix_3_digits').agg({'is_delayed': ['sum', 'count']})
agg_name = 'delayed'
gp[agg_name] = gp['is_delayed']['sum'] / gp['is_delayed']['count']
gp = gp[agg_name]
order_delay = brazil_geo.join(gp)

plot_map(order_delay, 'Orders Delay Percentage in Brazil', ds.mean(agg_name), agg_name, cmap=bgy)
Out[2]:
In [3]:
gp = orders.groupby('customer_zip_code_prefix_3_digits')['review_score'].mean().to_frame()
gp = gp.loc[(gp['review_score'] < 3.5)]
score = brazil_geo.join(gp)
agg_name = 'avg_score'
score[agg_name] = score['review_score']

plot_map(score, 'Orders Bad Review Score', ds.mean(agg_name), agg_name, cmap=bgy)
Out[3]:
In [4]:
orders['is_delayed'] = orders['order_delivered_customer_date'] > orders['order_estimated_delivery_date'] 
gp = orders.groupby('customer_zip_code_prefix_3_digits').agg({'is_delayed': ['sum', 'count']})
agg_name = 'delayed'
gp[agg_name] = gp['is_delayed']['sum'] / gp['is_delayed']['count']
gp = gp.loc[(gp['delayed'] > 0.2)]
gp = gp[agg_name]
order_delay = brazil_geo.join(gp)

plot_map(order_delay, 'Orders With Big Delay Percentage in Brazil', ds.mean(agg_name), agg_name, cmap=bgy)
Out[4]:
In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns;

orders = pd.read_csv('../dataset/olist_orders_dataset.csv', delimiter = ',')
customers = pd.read_csv('../dataset/olist_customers_dataset.csv', delimiter = ',')
reviews = pd.read_csv('../dataset/olist_order_reviews_dataset.csv', delimiter = ',')

final = pd.merge(orders, customers, on='customer_id')
final = pd.merge(final, reviews, on='order_id')

final['is_delayed'] = final['order_delivered_customer_date'] > final['order_estimated_delivery_date']

final['order_delivered_customer_date'] = pd.to_datetime(final['order_delivered_customer_date'])
final['order_estimated_delivery_date'] = pd.to_datetime(final['order_estimated_delivery_date'])
final['time_delay'] = final['order_delivered_customer_date'] - final['order_estimated_delivery_date']

customer_delay = final.groupby('order_id', as_index=False).agg({'is_delayed': ['sum', 'count']})
customer_delay['delay'] = customer_delay['is_delayed']['sum'] / customer_delay['is_delayed']['count']

state_delay = final.groupby('customer_state', as_index=False).agg({'is_delayed': ['sum', 'count']})
state_delay['delay'] = state_delay['is_delayed']['sum'] / state_delay['is_delayed']['count']

customer_delay = customer_delay.sort_values(by='delay', ascending=False)
state_delay = state_delay.sort_values(by='delay', ascending=False)

state_reviews = final.groupby('customer_state', as_index=False)['review_score'].mean()
state_reviews = pd.merge(state_reviews, state_delay, on='customer_state')
state_reviews['delay'] = state_reviews[('delay', '')]
state_reviews['delayed'] = state_reviews[('is_delayed', 'sum')]
state_reviews['total_orders'] = state_reviews[('is_delayed', 'count')]
state_reviews = state_reviews[['customer_state','review_score','delay','delayed','total_orders']]

state_reviews = state_reviews.sort_values('delay', ascending=False)

state_reviews.head(27)
Out[5]:
customer_state review_score delay delayed total_orders
1 AL 3.731415 0.230216 96.0 417
9 MA 3.745672 0.187750 141.0 751
16 PI 3.907258 0.153226 76.0 496
5 CE 3.837939 0.147125 197.0 1339
24 SE 3.800000 0.145714 51.0 350
4 BA 3.834314 0.135668 461.0 3398
18 RJ 3.853442 0.129466 1674.0 12930
26 TO 4.100000 0.125000 35.0 280
7 ES 4.009790 0.119922 245.0 2043
13 PA 3.827902 0.119145 117.0 982
11 MS 4.115702 0.112948 82.0 726
21 RR 3.608696 0.108696 5.0 46
14 PB 4.007449 0.106145 57.0 537
19 RN 4.075975 0.104723 51.0 487
15 PE 3.993393 0.103303 172.0 1665
23 SC 4.055875 0.095316 348.0 3651
8 GO 4.026019 0.079529 162.0 2037
22 RS 4.125658 0.070093 386.0 5507
6 DF 4.056019 0.068981 149.0 2160
12 MT 4.087912 0.065934 60.0 910
25 SP 4.160579 0.057021 2393.0 41967
10 MG 4.121049 0.054758 641.0 11706
17 PR 4.167654 0.048973 248.0 5064
3 AP 4.176471 0.044118 3.0 68
2 AM 4.154362 0.040268 6.0 149
0 AC 4.049383 0.037037 3.0 81
20 RO 4.055336 0.027668 7.0 253

States with biggest Average Delay (%)

In [6]:
#dado.reset_index(inplace=True)
sns.set(rc={'figure.figsize':(15,13)})
ax = sns.barplot(x="delay", y="customer_state", data=(state_reviews.head(27)))

States with biggest Average Reviews

In [7]:
state_reviews = state_reviews.sort_values('review_score', ascending=True)
ax = sns.barplot(x="review_score", y="customer_state", data=state_reviews.head(27))
ax.set(xlim=(3.5, 4.2))
Out[7]:
[(3.5, 4.2)]
In [8]:
delay = final.groupby(["is_delayed"]).size().reset_index(name="amount")
delay = delay.iloc[1][1] / delay.iloc[0][1]
print(("%.2f" % (delay*100)),'% of all orders are delayed\n')

delay = final.loc[(final['review_score'] <= 3)]
delay = delay.groupby(["is_delayed"]).size().reset_index(name="amount")
bad_delay = delay.iloc[1][1] / delay.iloc[0][1]
print(("%.2f" % (bad_delay*100)),'% of the orders with bad reviews are delayed\n')

delay = final.loc[(final['review_score'] >= 4)]
delay = delay.groupby(["is_delayed"]).size().reset_index(name="amount")
good_delay = delay.iloc[1][1] / delay.iloc[0][1]
print(("%.2f" % (good_delay*100)),'% of the orders with good reviews are delayed \n')

print(("%.2f" % ((1-bad_delay)*100)),'% of the orders with bad review are not related to delay\n')
print(("%.2f" % ((1-good_delay)*100)),'% of the orders with good review are not delayed \n')
8.54 % of all orders are delayed

28.51 % of the orders with bad reviews are delayed

3.62 % of the orders with good reviews are delayed 

71.49 % of the orders with bad review are not related to delay

96.38 % of the orders with good review are not delayed